API Evolution and Breaking Changes Analysis¶
This notebook analyzes the dataset in data/, where each library has:
library-commits.csv: commit-level project/API metrics; library-bcs.csv: per-breaking-change records
It focuses on commit dynamics, API growth, breaking-change intensity, compatibility impact, and the role of excluded/internal/deprecated symbols.
from pathlib import Path
import math
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
# Global plotting/display configuration shared by every cell below.
sns.set_theme(style="whitegrid", context="talk")
pd.set_option("display.max_columns", 200)  # summary tables here are wide; show all columns
pd.set_option("display.width", 220)
# All input CSVs are expected under ./data relative to the notebook.
DATA_DIR = Path("data")
if not DATA_DIR.exists():
    raise FileNotFoundError("Expected directory 'data/' with split CSV files.")
print(f"Using data directory: {DATA_DIR.resolve()}")
print(f"pandas={pd.__version__}, seaborn={sns.__version__}")
Using data directory: /home/dig/repositories/roseau-full-bench/results/use_case/walk/data pandas=2.3.3, seaborn=0.13.2
# Columns parsed as booleans in the two CSV schemas (commits vs BC records).
BOOL_COLS_COMMITS = ["is_merge_commit", "has_java_changes", "has_pom_changes"]
BOOL_COLS_BCS = ["is_excluded_symbol", "is_deprecated_removal", "is_internal_removal"]
# Commit-level columns coerced to numeric at load time (unparseable/missing -> 0).
NUMERIC_COLS_COMMITS = [
    "days_since_prev_commit", "files_changed", "loc_added", "loc_deleted",
    "all_api_types_count", "all_api_methods_count", "all_api_fields_count", "all_api_symbols_count",
    "exported_types_count", "exported_methods_count", "exported_fields_count",
    "deprecated_count", "internal_count",
    "breaking_changes_count", "binary_breaking_changes_count", "source_breaking_changes_count",
    "checkout_time_ms", "classpath_time_ms", "api_time_ms", "diff_time_ms", "stats_time_ms",
]
def parse_bool_series(s: pd.Series) -> pd.Series:
    """Parse a series of 'true'/'false' strings (any casing) into booleans.

    Any value that does not lower-case to the literal string 'true'
    (including 'false', NaN, or arbitrary text) maps to False.
    """
    lowered = s.astype(str).str.lower()
    return lowered == "true"
def load_split_data(data_dir: Path):
    """Load and normalize the per-library commit and breaking-change CSVs.

    Expects `<lib>-commits.csv` and `<lib>-bcs.csv` file pairs under
    `data_dir`. Returns a 4-tuple:
    (commits frame, bcs frame, sorted libraries having both files,
     sorted libraries missing one of the two files).
    """
    commits_frames = []
    bcs_frames = []
    commit_files = sorted(data_dir.glob("*-commits.csv"))
    bc_files = sorted(data_dir.glob("*-bcs.csv"))
    # Library names are derived from the file-name prefixes.
    libs_from_commits = {p.name.replace("-commits.csv", "") for p in commit_files}
    libs_from_bcs = {p.name.replace("-bcs.csv", "") for p in bc_files}
    # Libraries that have only one of the two expected files.
    missing_pairs = sorted(libs_from_commits.symmetric_difference(libs_from_bcs))
    if missing_pairs:
        print("Warning: some libraries do not have both files:", missing_pairs)
    for path in commit_files:
        lib = path.name.replace("-commits.csv", "")
        df = pd.read_csv(path)
        if "library" not in df.columns:
            df["library"] = lib
        # Coerce timestamps to UTC; rows with unparseable dates are dropped,
        # and each library's commits are kept date-sorted (later cells rely on this).
        df["date_utc"] = pd.to_datetime(df["date_utc"], errors="coerce", utc=True)
        df = df.dropna(subset=["date_utc"]).sort_values("date_utc").reset_index(drop=True)
        for c in NUMERIC_COLS_COMMITS:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
        for c in BOOL_COLS_COMMITS:
            if c in df.columns:
                df[c] = parse_bool_series(df[c])
        # Derived commit-level features
        df["net_loc"] = df["loc_added"] - df["loc_deleted"]
        df["abs_loc_churn"] = df["loc_added"] + df["loc_deleted"]
        df["exported_symbols_count"] = (
            df["exported_types_count"] + df["exported_methods_count"] + df["exported_fields_count"]
        )
        df["all_symbols_count"] = (
            df["all_api_types_count"] + df["all_api_methods_count"] + df["all_api_fields_count"]
        )
        # Ratio features guard against division by zero by mapping 0 denominators
        # to NaN first, then collapsing NaN/inf results back to 0.
        df["breaks_per_1k_exported"] = (
            df["breaking_changes_count"] / df["exported_symbols_count"].replace(0, np.nan) * 1000
        ).replace([np.inf, -np.inf], np.nan).fillna(0)
        df["internal_share"] = (
            df["internal_count"] / df["all_symbols_count"].replace(0, np.nan)
        ).replace([np.inf, -np.inf], np.nan).fillna(0)
        df["deprecated_share"] = (
            df["deprecated_count"] / df["exported_symbols_count"].replace(0, np.nan)
        ).replace([np.inf, -np.inf], np.nan).fillna(0)
        commits_frames.append(df)
    for path in bc_files:
        lib = path.name.replace("-bcs.csv", "")
        df = pd.read_csv(path)
        if "library" not in df.columns:
            df["library"] = lib
        for c in BOOL_COLS_BCS:
            if c in df.columns:
                df[c] = parse_bool_series(df[c])
        bcs_frames.append(df)
    # Empty frames keep downstream cells from crashing if the glob matched nothing.
    commits = pd.concat(commits_frames, ignore_index=True) if commits_frames else pd.DataFrame()
    bcs = pd.concat(bcs_frames, ignore_index=True) if bcs_frames else pd.DataFrame()
    return commits, bcs, sorted(libs_from_commits.intersection(libs_from_bcs)), missing_pairs
# Load everything once; all downstream cells reuse commits_df / bcs_df / libraries.
commits_df, bcs_df, libraries, missing_pairs = load_split_data(DATA_DIR)
print(f"Libraries with complete pairs: {len(libraries)}")
print(f"Commit rows: {len(commits_df):,}")
print(f"Breaking-change rows: {len(bcs_df):,}")
if missing_pairs:
    print("Libraries missing one side:", missing_pairs)
Libraries with complete pairs: 35 Commit rows: 146,334 Breaking-change rows: 658,235
Data Quality and Coverage¶
# Per-library coverage summary: activity span, break volume, and final API size.
coverage = commits_df.groupby("library").agg(
    commits=("commit_sha", "count"),
    first_commit=("date_utc", "min"),
    last_commit=("date_utc", "max"),
    commits_with_breaks=("breaking_changes_count", lambda s: int((s > 0).sum())),
    total_reported_breaks=("breaking_changes_count", "sum"),
    # "last" relies on each library being date-sorted in load_split_data,
    # so these reflect the most recent commit's symbol counts.
    final_exported_symbols=("exported_symbols_count", "last"),
    final_internal_symbols=("internal_count", "last"),
    merge_commit_rate=("is_merge_commit", "mean"),
    java_change_rate=("has_java_changes", "mean"),
).sort_values("total_reported_breaks", ascending=False)
# Cross-check: commit-level break count vs row-level BC records
bc_counts_from_rows = bcs_df.groupby("library").size().rename("bc_rows")
coverage = coverage.join(bc_counts_from_rows, how="left").fillna({"bc_rows": 0})
# A non-zero delta would indicate disagreement between the two CSV sources.
coverage["bc_count_delta"] = coverage["total_reported_breaks"] - coverage["bc_rows"]
coverage.head(20)
| commits | first_commit | last_commit | commits_with_breaks | total_reported_breaks | final_exported_symbols | final_internal_symbols | merge_commit_rate | java_change_rate | bc_rows | bc_count_delta | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| library | |||||||||||
| assertj-core | 3672 | 2010-09-07 04:06:36+00:00 | 2026-02-07 22:12:04+00:00 | 1001 | 230771 | 6932 | 85 | 0.049564 | 0.512527 | 230771 | 0 |
| jackson-databind | 6582 | 2011-12-23 08:31:35+00:00 | 2026-02-08 04:52:24+00:00 | 2007 | 145244 | 9122 | 0 | 0.421908 | 0.822546 | 145244 | 0 |
| rxjava-core | 3499 | 2013-01-09 06:21:43+00:00 | 2026-02-06 08:29:11+00:00 | 541 | 128110 | 4254 | 1850 | 0.335810 | 0.661332 | 128110 | 0 |
| hibernate-core | 4826 | 2010-10-11 19:41:47+00:00 | 2016-08-10 05:41:04+00:00 | 846 | 34244 | 25810 | 9464 | 0.016162 | 0.846042 | 34244 | 0 |
| h2-database | 8024 | 2006-12-15 00:12:44+00:00 | 2026-01-31 05:09:18+00:00 | 1384 | 34007 | 10719 | 0 | 0.192547 | 0.871386 | 34007 | 0 |
| tomcat | 27700 | 2006-03-27 13:53:46+00:00 | 2026-02-06 22:39:49+00:00 | 3287 | 33514 | 23836 | 0 | 0.002310 | 0.750433 | 33514 | 0 |
| commons-collections | 4856 | 2001-04-14 15:38:58+00:00 | 2026-02-06 14:23:17+00:00 | 406 | 9963 | 3713 | 0 | 0.022035 | 0.678336 | 9963 | 0 |
| jackson-core | 2572 | 2011-12-23 07:00:40+00:00 | 2026-02-07 21:49:29+00:00 | 424 | 7384 | 3002 | 0 | 0.422628 | 0.683904 | 7384 | 0 |
| joda-time | 2132 | 2003-12-16 21:39:27+00:00 | 2026-01-21 20:45:38+00:00 | 178 | 3676 | 3045 | 0 | 0.043152 | 0.568011 | 3676 | 0 |
| netty-codec-http | 9661 | 2011-12-28 10:44:04+00:00 | 2026-02-06 10:10:55+00:00 | 178 | 3392 | 2654 | 0 | 0.008281 | 0.840182 | 3392 | 0 |
| commons-pool | 2804 | 2001-04-14 16:40:29+00:00 | 2026-02-06 14:39:30+00:00 | 240 | 3350 | 609 | 0 | 0.016762 | 0.529601 | 3350 | 0 |
| httpcomponents-client | 3170 | 2009-03-01 16:36:52+00:00 | 2026-02-07 18:00:09+00:00 | 264 | 3116 | 2382 | 77 | 0.004416 | 0.782334 | 3116 | 0 |
| commons-beanutils | 2196 | 2001-03-27 05:25:57+00:00 | 2026-02-06 14:21:23+00:00 | 133 | 2285 | 758 | 0 | 0.013661 | 0.518215 | 2285 | 0 |
| commons-lang | 8614 | 2002-07-19 03:35:56+00:00 | 2026-02-07 20:01:43+00:00 | 396 | 2274 | 4284 | 0 | 0.024379 | 0.717088 | 2274 | 0 |
| guava | 6972 | 2011-04-15 17:22:23+00:00 | 2026-02-07 00:34:41+00:00 | 378 | 2037 | 5074 | 39 | 0.000143 | 0.897734 | 2037 | 0 |
| fastjson2-core | 4597 | 2022-04-17 05:16:05+00:00 | 2026-02-08 00:22:56+00:00 | 367 | 2000 | 4910 | 452 | 0.048727 | 0.774636 | 2000 | 0 |
| fastjson-core | 2934 | 2011-07-31 12:05:24+00:00 | 2023-05-12 06:16:03+00:00 | 322 | 1829 | 2124 | 0 | 0.140082 | 0.914110 | 1829 | 0 |
| log4j-api | 9835 | 2013-08-26 12:21:54+00:00 | 2026-01-22 10:08:42+00:00 | 145 | 1546 | 2200 | 90 | 0.032028 | 0.614540 | 1546 | 0 |
| commons-compress | 5436 | 2003-11-23 20:07:47+00:00 | 2026-02-06 14:23:42+00:00 | 213 | 1370 | 3706 | 27 | 0.017476 | 0.733996 | 1370 | 0 |
| commons-io | 5560 | 2002-01-26 02:47:42+00:00 | 2026-02-07 21:09:27+00:00 | 187 | 1209 | 2505 | 0 | 0.016187 | 0.680576 | 1209 | 0 |
# Side-by-side library rankings: commit volume vs breaking-change event volume.
# Figure height scales with library count so bar labels stay readable.
fig, axes = plt.subplots(1, 2, figsize=(18, max(7, 0.45 * len(coverage))))
commits_plot = coverage.sort_values("commits", ascending=False).reset_index()
sns.barplot(data=commits_plot, y="library", x="commits", ax=axes[0], color="#2a9d8f")
axes[0].set_title("Libraries by Commit Count")
axes[0].set_xlabel("Commits analyzed")
axes[0].set_ylabel("")
# Use row-level BC counts for direct event volume comparison
bcs_plot = coverage.sort_values("bc_rows", ascending=False).reset_index()
sns.barplot(data=bcs_plot, y="library", x="bc_rows", ax=axes[1], color="#e76f51")
axes[1].set_title("Libraries by Breaking-Change Events")
axes[1].set_xlabel("Breaking-change records")
axes[1].set_ylabel("")
plt.tight_layout()
plt.show()
Per-Library Timeline: API Size and Breaking Changes¶
- API size evolution (`all_api_symbols_count` and `exported_symbols_count`)
- breaking-change introductions as vertical red bars at commit timestamps
# Per-library timeline plot (API size + vertical BC bars per commit)
libs_order = coverage.sort_values("commits", ascending=False).index.tolist()
for lib in libs_order:
    d = commits_df[commits_df["library"] == lib].sort_values("date_utc").copy()
    if d.empty:
        continue
    fig, ax = plt.subplots(figsize=(14, 4.5))
    # API size evolution
    ax.plot(d["date_utc"], d["all_api_symbols_count"], color="#1d3557", linewidth=1.6, label="All API symbols")
    ax.plot(d["date_utc"], d["exported_symbols_count"], color="#2a9d8f", linewidth=1.6, label="Exported symbols")
    # Vertical red bars for commits with breaking changes
    bc_commits = d[d["breaking_changes_count"] > 0]
    if not bc_commits.empty:
        ymax = max(float(d["all_api_symbols_count"].max()), 1.0)
        # bar height scales with break count to preserve intensity signal
        bar_top = (bc_commits["breaking_changes_count"] / bc_commits["breaking_changes_count"].max()) * ymax
        ax.vlines(
            bc_commits["date_utc"],
            ymin=0,
            ymax=bar_top,
            color="red",
            alpha=0.45,
            linewidth=1.0,
            # bc_commits is guaranteed non-empty inside this branch, so the
            # label is unconditional (the original `if len(bc_commits)` guard
            # could never be False here).
            label="Breaking change commit",
        )
    ax.set_title(f"{lib}: API evolution and breaking-change introductions")
    ax.set_xlabel("Date")
    ax.set_ylabel("Symbol count")
    ax.grid(alpha=0.25)
    # deduplicate legend labels defensively before rendering the legend
    handles, labels = ax.get_legend_handles_labels()
    uniq = dict(zip(labels, handles))
    ax.legend(uniq.values(), uniq.keys(), loc="upper left")
    plt.tight_layout()
    plt.show()
    # Close each figure explicitly to avoid memory buildup across many libraries.
    plt.close(fig)
API Evolution and Break Risk¶
# Libraries with largest final exported APIs
focus_libs = (
coverage.sort_values("final_exported_symbols", ascending=False)
.index
.tolist()
)
fig, axes = plt.subplots(18, 2, figsize=(18, 100), sharex=False)
axes = axes.flatten()
for i, lib in enumerate(focus_libs):
d = commits_df[commits_df["library"] == lib].copy().sort_values("date_utc")
if d.empty:
axes[i].axis("off")
continue
d["rolling_break_risk"] = d["breaks_per_1k_exported"].rolling(window=60, min_periods=10).mean()
ax = axes[i]
ax.plot(d["date_utc"], d["exported_symbols_count"], label="Exported symbols", color="#264653", linewidth=2)
ax.plot(d["date_utc"], d["internal_count"], label="Internal symbols", color="#8ab17d", linewidth=1.6)
ax.set_title(lib)
ax.set_ylabel("Symbol count")
ax.grid(alpha=0.25)
ax2 = ax.twinx()
ax2.plot(d["date_utc"], d["rolling_break_risk"], label="Rolling breaks/1k exported", color="#e76f51", linewidth=1.6)
ax2.set_ylabel("Breaks per 1k")
lines, labels = ax.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax.legend(lines + lines2, labels + labels2, loc="upper left", fontsize=9)
for j in range(len(focus_libs), len(axes)):
axes[j].axis("off")
plt.suptitle("API size evolution and normalized breaking-change risk", y=1.02)
plt.tight_layout()
plt.show()
# Downsample for scatter readability and speed; fixed seed keeps it reproducible.
sample = commits_df[["library", "abs_loc_churn", "files_changed", "breaking_changes_count", "breaks_per_1k_exported"]].copy()
if len(sample) > 100_000:
    sample = sample.sample(100_000, random_state=42)
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
sns.scatterplot(
    data=sample,
    x="abs_loc_churn",
    y="breaking_changes_count",
    hue="library",
    alpha=0.25,
    linewidth=0,
    s=18,
    legend=False,
    ax=axes[0],
)
# symlog keeps the many zero-valued commits visible while spreading the heavy tail.
axes[0].set_xscale("symlog", linthresh=1)
axes[0].set_yscale("symlog", linthresh=1)
axes[0].set_title("Code churn vs breaking changes per commit")
axes[0].set_xlabel("LOC churn (added + deleted)")
axes[0].set_ylabel("Breaking changes")
sns.scatterplot(
    data=sample,
    x="files_changed",
    y="breaks_per_1k_exported",
    hue="library",
    alpha=0.25,
    linewidth=0,
    s=18,
    legend=False,
    ax=axes[1],
)
axes[1].set_xscale("symlog", linthresh=1)
# smaller linthresh: the normalized rate takes fractional values near zero
axes[1].set_yscale("symlog", linthresh=0.1)
axes[1].set_title("Files changed vs normalized break risk")
axes[1].set_xlabel("Files changed")
axes[1].set_ylabel("Breaks per 1k exported symbols")
plt.tight_layout()
plt.show()
Breaking-Change Taxonomy¶
# Top kind distribution by library
# Top kind distribution by library
kind_counts = (
    bcs_df.groupby(["library", "kind"]).size().rename("count").reset_index()
)
# Restrict to the 15 globally most frequent kinds to keep the heatmap legible.
top_kinds = bcs_df["kind"].value_counts().head(15).index
kind_matrix = (
    kind_counts[kind_counts["kind"].isin(top_kinds)]
    .pivot(index="library", columns="kind", values="count")
    .fillna(0)
)
# Order rows by total event volume so the busiest libraries appear first.
kind_matrix = kind_matrix.loc[kind_matrix.sum(axis=1).sort_values(ascending=False).index]
plt.figure(figsize=(18, 20))
# log1p compresses the multi-order-of-magnitude spread between libraries.
sns.heatmap(np.log1p(kind_matrix), cmap="YlOrRd", cbar_kws={"label": "log(1 + BC events)"})
plt.title("Breaking-change kind intensity by library (top 15 kinds)")
plt.xlabel("Breaking-change kind")
plt.ylabel("Library")
plt.tight_layout()
plt.show()
# Per-library distribution of BC events over compatibility categories.
compat = (
    bcs_df.groupby(["library", "compatibility"]).size().rename("count").reset_index()
)
# Within-library share: each count divided by its library's total event count.
compat["share"] = compat["count"] / compat.groupby("library")["count"].transform("sum")
top_libs_for_compat = coverage.sort_values("bc_rows", ascending=False).head(12).index
plot_compat = compat[compat["library"].isin(top_libs_for_compat)]
plt.figure(figsize=(18, 8))
sns.barplot(data=plot_compat, x="library", y="share", hue="compatibility")
plt.title("Compatibility profile by library (top 12 by BC count)")
plt.xlabel("Library")
plt.ylabel("Share of breaking changes")
plt.xticks(rotation=70, ha="right")
plt.tight_layout()
plt.show()
Excluded, Internal, and Deprecated-Removal Signals¶
# Share of BC events attributable to excluded / internal / deprecated symbols.
flags = (
    bcs_df.groupby("library").agg(
        total_bcs=("kind", "count"),
        excluded_bcs=("is_excluded_symbol", "sum"),
        internal_removals=("is_internal_removal", "sum"),
        deprecated_removals=("is_deprecated_removal", "sum"),
    )
    .reset_index()
)
for col in ["excluded_bcs", "internal_removals", "deprecated_removals"]:
    # Guard against zero-BC libraries: 0 denominator -> NaN -> 0 share.
    flags[f"{col}_share"] = (flags[col] / flags["total_bcs"].replace(0, np.nan)).fillna(0)
flag_plot = flags.sort_values("total_bcs", ascending=False)
# Long format so seaborn can draw the three share metrics as grouped bars.
long_flag = flag_plot.melt(
    id_vars=["library", "total_bcs"],
    value_vars=["excluded_bcs_share", "internal_removals_share", "deprecated_removals_share"],
    var_name="metric",
    value_name="share",
)
# Human-readable legend labels.
name_map = {
    "excluded_bcs_share": "Excluded symbol BC share",
    "internal_removals_share": "Internal removal share",
    "deprecated_removals_share": "Deprecated-removal share",
}
long_flag["metric"] = long_flag["metric"].map(name_map)
plt.figure(figsize=(20, 8))
sns.barplot(data=long_flag, x="library", y="share", hue="metric")
plt.title("Excluded/internal/deprecated-removal ratios by library")
plt.xlabel("Library")
plt.ylabel("Share of breaking-change events")
plt.xticks(rotation=70, ha="right")
plt.tight_layout()
plt.show()
flags.sort_values("excluded_bcs_share", ascending=False).head(10)
| library | total_bcs | excluded_bcs | internal_removals | deprecated_removals | excluded_bcs_share | internal_removals_share | deprecated_removals_share | |
|---|---|---|---|---|---|---|---|---|
| 32 | rxjava-core | 128110 | 100935 | 2773 | 1879 | 0.787878 | 0.021645 | 0.014667 |
| 15 | gson | 470 | 353 | 121 | 0 | 0.751064 | 0.257447 | 0.000000 |
| 16 | guava | 2037 | 1488 | 189 | 278 | 0.730486 | 0.092784 | 0.136475 |
| 19 | hibernate-core | 34244 | 5369 | 2727 | 730 | 0.156787 | 0.079634 | 0.021318 |
| 14 | fastjson2-core | 2000 | 151 | 137 | 3 | 0.075500 | 0.068500 | 0.001500 |
| 1 | assertj-core | 230771 | 9699 | 6993 | 1397 | 0.042029 | 0.030303 | 0.006054 |
| 28 | log4j-api | 1546 | 64 | 12 | 1 | 0.041397 | 0.007762 | 0.000647 |
| 20 | httpcomponents-client | 3116 | 124 | 22 | 214 | 0.039795 | 0.007060 | 0.068678 |
| 18 | hamcrest-core | 435 | 8 | 8 | 1 | 0.018391 | 0.018391 | 0.002299 |
| 27 | jsoup | 919 | 14 | 8 | 51 | 0.015234 | 0.008705 | 0.055495 |
# Breakdown of impacted-symbol visibility per library.
vis = (
    bcs_df.groupby(["library", "symbol_visibility"]).size().rename("count").reset_index()
)
# Within-library share: each count divided by its library's total.
vis["share"] = vis["count"] / vis.groupby("library")["count"].transform("sum")
top_libs = coverage.sort_values("bc_rows", ascending=False).head(10).index
vis_plot = vis[vis["library"].isin(top_libs)]
plt.figure(figsize=(18, 8))
sns.barplot(data=vis_plot, x="library", y="share", hue="symbol_visibility")
plt.title("Visibility of impacted symbols in breaking changes")
plt.xlabel("Library")
plt.ylabel("Share")
plt.xticks(rotation=65, ha="right")
plt.tight_layout()
plt.show()
Temporal Patterns and Concentration¶
# Aggregate commit activity and break volume per (library, calendar month).
monthly_commits = commits_df.copy()
# Drop the timezone before converting to a monthly period (Period is tz-naive).
monthly_commits["month"] = monthly_commits["date_utc"].dt.tz_convert(None).dt.to_period("M").astype(str)
monthly = monthly_commits.groupby(["library", "month"]).agg(
    monthly_commits=("commit_sha", "count"),
    monthly_breaks=("breaking_changes_count", "sum"),
    monthly_abs_churn=("abs_loc_churn", "sum"),
).reset_index()
# Normalize by commit volume; 0-commit denominators fall back to 0.
monthly["breaks_per_100_commits"] = (monthly["monthly_breaks"] / monthly["monthly_commits"].replace(0, np.nan) * 100).fillna(0)
# Heatmap of monthly break intensity for most active libraries
active_libs = coverage.sort_values("commits", ascending=False).index
heat = monthly[monthly["library"].isin(active_libs)].pivot(index="library", columns="month", values="breaks_per_100_commits").fillna(0)
plt.figure(figsize=(22, 18))
# log1p tames the spread so occasional huge months don't wash out the rest.
sns.heatmap(np.log1p(heat), cmap="rocket_r", cbar_kws={"label": "log(1 + breaks per 100 commits)"})
plt.title("Temporal break intensity (most active libraries)")
plt.xlabel("Month")
plt.ylabel("Library")
plt.tight_layout()
plt.show()
# Pareto concentration: how many commits explain 80% of breaking changes in each library
pareto_rows = []
for lib, d in commits_df.groupby("library"):
    # Rank commits by break count (descending) so the cumulative sum tells us
    # how few commits are needed to cover a given share of all breaks.
    s = d["breaking_changes_count"].sort_values(ascending=False).reset_index(drop=True)
    total = s.sum()
    if total <= 0:
        # No breaks at all: concentration is undefined for this library.
        pareto_rows.append({"library": lib, "commits_for_80pct_breaks": np.nan, "total_commits": len(d)})
        continue
    csum = s.cumsum()
    # Smallest k such that the top-k commits reach >= 80% of all breaks.
    needed = int((csum < (0.8 * total)).sum() + 1)
    pareto_rows.append({"library": lib, "commits_for_80pct_breaks": needed, "total_commits": len(d)})
pareto = pd.DataFrame(pareto_rows)
# Express concentration as a share of each library's total commits.
pareto["share_of_commits_for_80pct_breaks"] = (
    pareto["commits_for_80pct_breaks"] / pareto["total_commits"].replace(0, np.nan)
)
pareto = pareto.sort_values("share_of_commits_for_80pct_breaks")
plt.figure(figsize=(16, 7))
sns.barplot(data=pareto.dropna(subset=["share_of_commits_for_80pct_breaks"]), x="library", y="share_of_commits_for_80pct_breaks", color="#457b9d")
plt.title("Concentration of breaking changes (lower means more concentrated)")
plt.xlabel("Library")
plt.ylabel("Commit share needed to accumulate 80% of breaks")
plt.xticks(rotation=70, ha="right")
plt.tight_layout()
plt.show()
pareto.head(15)
| library | commits_for_80pct_breaks | total_commits | share_of_commits_for_80pct_breaks | |
|---|---|---|---|---|
| 0 | JSON-java | 1 | 389 | 0.002571 |
| 28 | log4j-api | 26 | 9835 | 0.002644 |
| 24 | jakartaee-servlet-api | 2 | 553 | 0.003617 |
| 29 | netty-codec-http | 43 | 9661 | 0.004451 |
| 12 | commons-text | 14 | 2322 | 0.006029 |
| 30 | protobuf-java | 10 | 1151 | 0.008688 |
| 4 | commons-cli | 16 | 1838 | 0.008705 |
| 8 | commons-io | 50 | 5560 | 0.008993 |
| 7 | commons-compress | 59 | 5436 | 0.010854 |
| 10 | commons-logging | 21 | 1826 | 0.011501 |
| 3 | commons-beanutils | 26 | 2196 | 0.011840 |
| 5 | commons-codec | 41 | 3172 | 0.012926 |
| 23 | jakartaee-jaxrs-api | 5 | 380 | 0.013158 |
| 6 | commons-collections | 65 | 4856 | 0.013386 |
| 33 | slf4j-api | 22 | 1638 | 0.013431 |
Hotspots in Impacted Packages and Types¶
def _hotspots(bc_frame: pd.DataFrame, fqn_col: str) -> pd.DataFrame:
    """Count BC events per (library, fqn_col) pair, most-impacted first."""
    return (
        bc_frame.dropna(subset=[fqn_col])
        .groupby(["library", fqn_col]).size().rename("count").reset_index()
        .sort_values("count", ascending=False)
    )

# Same ranking logic applied at package and type granularity.
pkg_hotspots = _hotspots(bcs_df, "impacted_package_fqn")
type_hotspots = _hotspots(bcs_df, "impacted_type_fqn")
print("Top impacted packages across all libraries:")
display(pkg_hotspots.head(20))
print("Top impacted types across all libraries:")
display(type_hotspots.head(20))
Top impacted packages across all libraries:
| library | impacted_package_fqn | count | |
|---|---|---|---|
| 77 | assertj-core | org.assertj.core.api | 180600 |
| 1215 | jackson-databind | com.fasterxml.jackson.databind.deser.std | 57955 |
| 1371 | rxjava-core | io.reactivex.internal.operators.observable | 39001 |
| 1367 | rxjava-core | io.reactivex.internal.operators.flowable | 27820 |
| 1402 | rxjava-core | io.reactivex.rxjava3.internal.operators.flowable | 12002 |
| 1210 | jackson-databind | com.fasterxml.jackson.databind.deser | 11785 |
| 1247 | jackson-databind | tools.jackson.databind.deser.jdk | 10886 |
| 1212 | jackson-databind | com.fasterxml.jackson.databind.deser.impl | 8228 |
| 1091 | hibernate-core | org.hibernate.type | 7595 |
| 1217 | jackson-databind | com.fasterxml.jackson.databind.ext | 7092 |
| 1447 | rxjava-core | rx.subjects | 6323 |
| 278 | assertj-core | org.fest.assertions.api | 6216 |
| 1235 | jackson-databind | com.fasterxml.jackson.databind.ser.std | 5951 |
| 1420 | rxjava-core | io.reactivex.subjects | 5095 |
| 715 | h2-database | org.h2.value | 4518 |
| 1373 | rxjava-core | io.reactivex.internal.operators.single | 4395 |
| 220 | assertj-core | org.assertj.core.condition | 4086 |
| 1366 | rxjava-core | io.reactivex.internal.operators.completable | 3793 |
| 128 | assertj-core | org.assertj.core.api.objectarray | 3595 |
| 1219 | jackson-databind | com.fasterxml.jackson.databind.ext.jdk8 | 3207 |
Top impacted types across all libraries:
| library | impacted_type_fqn | count | |
|---|---|---|---|
| 1764 | assertj-core | org.assertj.core.api.BDDAssertions | 6043 |
| 1646 | assertj-core | org.assertj.core.api.Assertions | 4219 |
| 20579 | jackson-databind | com.fasterxml.jackson.databind.deser.BeanDeser... | 3500 |
| 20852 | jackson-databind | com.fasterxml.jackson.databind.deser.std.Throw... | 3421 |
| 2029 | assertj-core | org.assertj.core.api.WithAssertions | 2934 |
| 20585 | jackson-databind | com.fasterxml.jackson.databind.deser.BuilderBa... | 2911 |
| 20580 | jackson-databind | com.fasterxml.jackson.databind.deser.BeanDeser... | 2726 |
| 1597 | assertj-core | org.assertj.core.api.AbstractIterableAssert | 2710 |
| 1600 | assertj-core | org.assertj.core.api.AbstractListAssert | 2648 |
| 20649 | jackson-databind | com.fasterxml.jackson.databind.deser.impl.Bean... | 2601 |
| 1952 | assertj-core | org.assertj.core.api.ObjectArrayAssert | 2530 |
| 20648 | jackson-databind | com.fasterxml.jackson.databind.deser.impl.Bean... | 2492 |
| 1824 | assertj-core | org.assertj.core.api.ConcreteIterableAssert | 2472 |
| 1892 | assertj-core | org.assertj.core.api.IterableAssert | 2395 |
| 1925 | assertj-core | org.assertj.core.api.ListAssert | 2323 |
| 1610 | assertj-core | org.assertj.core.api.AbstractObjectArrayAssert | 2302 |
| 1855 | assertj-core | org.assertj.core.api.FactoryBasedNavigableIter... | 1939 |
| 1811 | assertj-core | org.assertj.core.api.ClassBasedNavigableIterab... | 1938 |
| 1856 | assertj-core | org.assertj.core.api.FactoryBasedNavigableList... | 1906 |
| 1911 | assertj-core | org.assertj.core.api.Java6Assertions | 1889 |
Consolidated Summary Table¶
summary = coverage.reset_index().merge(
flags[["library", "excluded_bcs_share", "internal_removals_share", "deprecated_removals_share"]],
on="library",
how="left",
).merge(
pareto[["library", "share_of_commits_for_80pct_breaks"]],
on="library",
how="left",
)
summary["breaks_per_1k_final_exported"] = (
summary["total_reported_breaks"] / summary["final_exported_symbols"].replace(0, np.nan) * 1000
).fillna(0)
summary = summary.sort_values(["total_reported_breaks", "commits"], ascending=[False, False])
summary.head(25)
| library | commits | first_commit | last_commit | commits_with_breaks | total_reported_breaks | final_exported_symbols | final_internal_symbols | merge_commit_rate | java_change_rate | bc_rows | bc_count_delta | excluded_bcs_share | internal_removals_share | deprecated_removals_share | share_of_commits_for_80pct_breaks | breaks_per_1k_final_exported | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | assertj-core | 3672 | 2010-09-07 04:06:36+00:00 | 2026-02-07 22:12:04+00:00 | 1001 | 230771 | 6932 | 85 | 0.049564 | 0.512527 | 230771 | 0 | 0.042029 | 0.030303 | 0.006054 | 0.043845 | 33290.680900 |
| 1 | jackson-databind | 6582 | 2011-12-23 08:31:35+00:00 | 2026-02-08 04:52:24+00:00 | 2007 | 145244 | 9122 | 0 | 0.421908 | 0.822546 | 145244 | 0 | 0.000103 | 0.000103 | 0.020669 | 0.024157 | 15922.385442 |
| 2 | rxjava-core | 3499 | 2013-01-09 06:21:43+00:00 | 2026-02-06 08:29:11+00:00 | 541 | 128110 | 4254 | 1850 | 0.335810 | 0.661332 | 128110 | 0 | 0.787878 | 0.021645 | 0.014667 | 0.014290 | 30115.185708 |
| 3 | hibernate-core | 4826 | 2010-10-11 19:41:47+00:00 | 2016-08-10 05:41:04+00:00 | 846 | 34244 | 25810 | 9464 | 0.016162 | 0.846042 | 34244 | 0 | 0.156787 | 0.079634 | 0.021318 | 0.018027 | 1326.772569 |
| 4 | h2-database | 8024 | 2006-12-15 00:12:44+00:00 | 2026-01-31 05:09:18+00:00 | 1384 | 34007 | 10719 | 0 | 0.192547 | 0.871386 | 34007 | 0 | 0.000000 | 0.000000 | 0.000706 | 0.018569 | 3172.590727 |
| 5 | tomcat | 27700 | 2006-03-27 13:53:46+00:00 | 2026-02-06 22:39:49+00:00 | 3287 | 33514 | 23836 | 0 | 0.002310 | 0.750433 | 33514 | 0 | 0.000000 | 0.000000 | 0.054336 | 0.023899 | 1406.024501 |
| 6 | commons-collections | 4856 | 2001-04-14 15:38:58+00:00 | 2026-02-06 14:23:17+00:00 | 406 | 9963 | 3713 | 0 | 0.022035 | 0.678336 | 9963 | 0 | 0.000000 | 0.000000 | 0.001807 | 0.013386 | 2683.274980 |
| 7 | jackson-core | 2572 | 2011-12-23 07:00:40+00:00 | 2026-02-07 21:49:29+00:00 | 424 | 7384 | 3002 | 0 | 0.422628 | 0.683904 | 7384 | 0 | 0.000000 | 0.000000 | 0.044556 | 0.027994 | 2459.693538 |
| 8 | joda-time | 2132 | 2003-12-16 21:39:27+00:00 | 2026-01-21 20:45:38+00:00 | 178 | 3676 | 3045 | 0 | 0.043152 | 0.568011 | 3676 | 0 | 0.000000 | 0.000000 | 0.000000 | 0.018762 | 1207.224959 |
| 9 | netty-codec-http | 9661 | 2011-12-28 10:44:04+00:00 | 2026-02-06 10:10:55+00:00 | 178 | 3392 | 2654 | 0 | 0.008281 | 0.840182 | 3392 | 0 | 0.000000 | 0.000000 | 0.018868 | 0.004451 | 1278.070836 |
| 10 | commons-pool | 2804 | 2001-04-14 16:40:29+00:00 | 2026-02-06 14:39:30+00:00 | 240 | 3350 | 609 | 0 | 0.016762 | 0.529601 | 3350 | 0 | 0.000000 | 0.000000 | 0.045373 | 0.025321 | 5500.821018 |
| 11 | httpcomponents-client | 3170 | 2009-03-01 16:36:52+00:00 | 2026-02-07 18:00:09+00:00 | 264 | 3116 | 2382 | 77 | 0.004416 | 0.782334 | 3116 | 0 | 0.039795 | 0.007060 | 0.068678 | 0.014826 | 1308.144416 |
| 12 | commons-beanutils | 2196 | 2001-03-27 05:25:57+00:00 | 2026-02-06 14:21:23+00:00 | 133 | 2285 | 758 | 0 | 0.013661 | 0.518215 | 2285 | 0 | 0.000000 | 0.000000 | 0.041575 | 0.011840 | 3014.511873 |
| 13 | commons-lang | 8614 | 2002-07-19 03:35:56+00:00 | 2026-02-07 20:01:43+00:00 | 396 | 2274 | 4284 | 0 | 0.024379 | 0.717088 | 2274 | 0 | 0.000000 | 0.000000 | 0.000440 | 0.014163 | 530.812325 |
| 14 | guava | 6972 | 2011-04-15 17:22:23+00:00 | 2026-02-07 00:34:41+00:00 | 378 | 2037 | 5074 | 39 | 0.000143 | 0.897734 | 2037 | 0 | 0.730486 | 0.092784 | 0.136475 | 0.016495 | 401.458415 |
| 15 | fastjson2-core | 4597 | 2022-04-17 05:16:05+00:00 | 2026-02-08 00:22:56+00:00 | 367 | 2000 | 4910 | 452 | 0.048727 | 0.774636 | 2000 | 0 | 0.075500 | 0.068500 | 0.001500 | 0.026539 | 407.331976 |
| 16 | fastjson-core | 2934 | 2011-07-31 12:05:24+00:00 | 2023-05-12 06:16:03+00:00 | 322 | 1829 | 2124 | 0 | 0.140082 | 0.914110 | 1829 | 0 | 0.000000 | 0.000000 | 0.004921 | 0.035446 | 861.111111 |
| 17 | log4j-api | 9835 | 2013-08-26 12:21:54+00:00 | 2026-01-22 10:08:42+00:00 | 145 | 1546 | 2200 | 90 | 0.032028 | 0.614540 | 1546 | 0 | 0.041397 | 0.007762 | 0.000647 | 0.002644 | 702.727273 |
| 18 | commons-compress | 5436 | 2003-11-23 20:07:47+00:00 | 2026-02-06 14:23:42+00:00 | 213 | 1370 | 3706 | 27 | 0.017476 | 0.733996 | 1370 | 0 | 0.000730 | 0.000730 | 0.000000 | 0.010854 | 369.670804 |
| 19 | commons-io | 5560 | 2002-01-26 02:47:42+00:00 | 2026-02-07 21:09:27+00:00 | 187 | 1209 | 2505 | 0 | 0.016187 | 0.680576 | 1209 | 0 | 0.000000 | 0.000000 | 0.001654 | 0.008993 | 482.634731 |
| 20 | commons-cli | 1838 | 2002-06-10 18:01:16+00:00 | 2026-02-06 14:22:14+00:00 | 87 | 1062 | 512 | 0 | 0.026115 | 0.431447 | 1062 | 0 | 0.000000 | 0.000000 | 0.000000 | 0.008705 | 2074.218750 |
| 21 | commons-codec | 3172 | 2003-04-25 17:51:00+00:00 | 2026-02-07 14:04:49+00:00 | 146 | 932 | 824 | 0 | 0.018600 | 0.559269 | 932 | 0 | 0.000000 | 0.000000 | 0.008584 | 0.012926 | 1131.067961 |
| 22 | jsoup | 2163 | 2011-07-02 11:11:39+00:00 | 2026-02-03 05:07:10+00:00 | 142 | 919 | 1285 | 80 | 0.041147 | 0.714286 | 919 | 0 | 0.015234 | 0.008705 | 0.055495 | 0.023578 | 715.175097 |
| 23 | protobuf-java | 1151 | 2008-07-10 02:12:20+00:00 | 2016-10-20 00:33:59+00:00 | 28 | 562 | 1706 | 0 | 0.550825 | 0.099044 | 562 | 0 | 0.000000 | 0.000000 | 0.007117 | 0.008688 | 329.425557 |
| 24 | gson | 1977 | 2008-09-01 03:13:32+00:00 | 2026-02-01 17:06:30+00:00 | 108 | 470 | 340 | 0 | 0.070309 | 0.669196 | 470 | 0 | 0.751064 | 0.257447 | 0.000000 | 0.017198 | 1382.352941 |
Commit Context of Breaking Changes¶
This section links *-bcs.csv rows back to commit metadata to understand the context in which breaking changes are introduced.
# Link BC rows to commit-level context
# Link BC rows to commit-level context
commit_context_cols = [
    "library", "commit_sha", "date_utc", "is_merge_commit", "has_java_changes", "has_pom_changes",
    "files_changed", "abs_loc_churn", "days_since_prev_commit", "breaking_changes_count",
    "binary_breaking_changes_count", "source_breaking_changes_count", "tag", "version", "branch",
    "exported_symbols_count", "internal_share", "deprecated_share",
]
commit_ctx = commits_df[commit_context_cols].copy()
# Rename so the merge key matches the BC table's commit column.
commit_ctx = commit_ctx.rename(columns={"commit_sha": "commit"})
# validate="many_to_one": many BC rows may map to one commit, never the reverse.
bcs_enriched = bcs_df.merge(commit_ctx, on=["library", "commit"], how="left", validate="many_to_one")
# Commit-level BC event counts from row-level table
bc_rows_per_commit = (
    bcs_enriched.groupby(["library", "commit"]).size().rename("bc_rows").reset_index()
)
# Left merge keeps commits that have no BC rows at all; fill their count with 0.
commit_enriched = commit_ctx.merge(bc_rows_per_commit, on=["library", "commit"], how="left")
commit_enriched["bc_rows"] = commit_enriched["bc_rows"].fillna(0)
commit_enriched["has_bc_rows"] = commit_enriched["bc_rows"] > 0
# Useful derived features
commit_enriched["is_tagged_commit"] = commit_enriched["tag"].notna() | commit_enriched["version"].notna()
# Binary share within a commit; commits with zero breaks get 0, not NaN.
commit_enriched["binary_share_in_commit"] = (
    commit_enriched["binary_breaking_changes_count"] /
    commit_enriched["breaking_changes_count"].replace(0, np.nan)
).fillna(0)
print("Enriched BC rows:", len(bcs_enriched))
print("Commits with at least one BC row:", int(commit_enriched["has_bc_rows"].sum()))
print("Share of commits with BC rows:", round(commit_enriched["has_bc_rows"].mean(), 4))
Enriched BC rows: 658235 Commits with at least one BC row: 13868 Share of commits with BC rows: 0.0948
# Compare contexts: breaking vs non-breaking commits
# Compare contexts: breaking vs non-breaking commits
context_summary = commit_enriched.groupby("has_bc_rows").agg(
    commits=("commit", "count"),
    # Medians are robust to the heavy-tailed churn / gap distributions.
    median_files_changed=("files_changed", "median"),
    median_abs_churn=("abs_loc_churn", "median"),
    median_days_since_prev=("days_since_prev_commit", "median"),
    merge_rate=("is_merge_commit", "mean"),
    java_change_rate=("has_java_changes", "mean"),
    pom_change_rate=("has_pom_changes", "mean"),
    tagged_commit_rate=("is_tagged_commit", "mean"),
).reset_index()
# Readable labels for the boolean group key.
context_summary["has_bc_rows"] = context_summary["has_bc_rows"].map({False: "No BC", True: "Has BC"})
context_summary
| has_bc_rows | commits | median_files_changed | median_abs_churn | median_days_since_prev | merge_rate | java_change_rate | pom_change_rate | tagged_commit_rate | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | No BC | 132466 | 1.0 | 14.0 | 0.0 | 0.071226 | 0.677593 | 0.115803 | 0.009361 |
| 1 | Has BC | 13868 | 5.0 | 133.0 | 0.0 | 0.135203 | 1.000000 | 0.024445 | 0.001947 |
# Break probability as a function of churn and inactivity
# Break probability as a function of churn and inactivity
work = commit_enriched.copy()
# rank(method="first") breaks ties so qcut can always form 10 equal-size bins
# even though most commits share tiny churn values.
work["churn_decile"] = pd.qcut(
    work["abs_loc_churn"].rank(method="first"),
    q=10,
    labels=[f"D{i}" for i in range(1, 11)]
)
work["inactivity_bucket"] = pd.cut(
    work["days_since_prev_commit"],
    # -0.1 lower edge keeps 0-day gaps inside the first bucket
    bins=[-0.1, 0, 1, 3, 7, 30, 365, np.inf],
    labels=["0d", "1d", "2-3d", "4-7d", "8-30d", "31-365d", ">365d"],
)
# observed=False keeps empty categorical buckets in the output.
p1 = work.groupby("churn_decile", observed=False)["has_bc_rows"].mean().reset_index(name="break_commit_rate")
p2 = work.groupby("inactivity_bucket", observed=False)["has_bc_rows"].mean().reset_index(name="break_commit_rate")
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
sns.barplot(data=p1, x="churn_decile", y="break_commit_rate", color="#e76f51", ax=axes[0])
axes[0].set_title("Probability a commit has BCs by churn decile")
axes[0].set_xlabel("Churn decile")
axes[0].set_ylabel("P(commit has BC rows)")
sns.barplot(data=p2, x="inactivity_bucket", y="break_commit_rate", color="#264653", ax=axes[1])
axes[1].set_title("Probability a commit has BCs by inactivity")
axes[1].set_xlabel("Days since previous commit")
axes[1].set_ylabel("P(commit has BC rows)")
plt.tight_layout()
plt.show()
Release/Tag Proximity Effects¶
Here, commits with a non-null tag or version are treated as release markers. We analyze if BCs cluster near these points.
def nearest_release_distance_days(lib_df: pd.DataFrame) -> pd.Series:
    """Distance in days from each commit to the closest release marker.

    A release marker is any commit in ``lib_df`` whose ``tag`` or ``version``
    is non-null. The result is aligned to ``lib_df``'s original index and is
    all-NaN when the library has no release markers at all.
    """
    d = lib_df.sort_values("date_utc").copy()
    marker_mask = d["tag"].notna() | d["version"].notna()
    release_dates = d.loc[marker_mask, "date_utc"].drop_duplicates().sort_values()
    if release_dates.empty:
        # Nothing to measure against: every distance is undefined.
        return pd.Series(np.nan, index=d.index)
    # Work in integer nanoseconds so searchsorted and abs are cheap and exact.
    release_ns = release_dates.astype("int64").to_numpy()
    commit_ns = d["date_utc"].astype("int64").to_numpy()
    # Insertion points locate, for each commit, the marker just after it;
    # idx - 1 (clipped into range) is the marker just before it.
    idx = np.searchsorted(release_ns, commit_ns)
    neighbors = [
        np.clip(idx - 1, 0, len(release_ns) - 1),
        np.clip(idx, 0, len(release_ns) - 1),
    ]
    dists = [np.abs(commit_ns - release_ns[n]) for n in neighbors]
    min_dist_days = np.minimum(dists[0], dists[1]) / 1e9 / 86400
    return pd.Series(min_dist_days, index=d.index).reindex(lib_df.index)
# Compute per-library distance to the nearest release marker, then bucket it.
commit_enriched["days_to_nearest_release"] = np.nan
for _lib, _idx in commit_enriched.groupby("library").groups.items():
    # Each library is measured against its own marker timeline.
    commit_enriched.loc[_idx, "days_to_nearest_release"] = (
        nearest_release_distance_days(commit_enriched.loc[_idx]).values
    )

release_bins = [-0.1, 0, 1, 3, 7, 30, 90, 365, np.inf]
release_labels = ["release day", "1d", "2-3d", "4-7d", "8-30d", "31-90d", "91-365d", ">365d"]
commit_enriched["release_proximity"] = pd.cut(
    commit_enriched["days_to_nearest_release"], bins=release_bins, labels=release_labels
)

release_effect = (
    commit_enriched.groupby("release_proximity", observed=False)
    .agg(
        commits=("commit", "count"),
        break_commit_rate=("has_bc_rows", "mean"),
        mean_breaks_per_commit=("bc_rows", "mean"),
    )
    .reset_index()
)
release_effect
| release_proximity | commits | break_commit_rate | mean_breaks_per_commit | |
|---|---|---|---|---|
| 0 | release day | 1277 | 0.021143 | 2.000000 |
| 1 | 1d | 7766 | 0.070306 | 3.789596 |
| 2 | 2-3d | 6090 | 0.105911 | 6.803777 |
| 3 | 4-7d | 8765 | 0.098688 | 6.581289 |
| 4 | 8-30d | 19751 | 0.098274 | 9.893778 |
| 5 | 31-90d | 15486 | 0.104223 | 9.643936 |
| 6 | 91-365d | 19885 | 0.059844 | 2.670405 |
| 7 | >365d | 39614 | 0.094739 | 2.417277 |
# Two panels driven by the same frame: rate of break commits and BC volume.
fig, axes = plt.subplots(1, 2, figsize=(17, 6))
panel_specs = [
    ("break_commit_rate", "#2a9d8f",
     "Break-commit rate by distance to nearest release marker",
     "P(commit has BC rows)"),
    ("mean_breaks_per_commit", "#f4a261",
     "Mean BC rows per commit by release proximity",
     "Mean BC rows per commit"),
]
for ax, (metric, color, title, ylabel) in zip(axes, panel_specs):
    sns.barplot(data=release_effect, x="release_proximity", y=metric, color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Distance to nearest release/tag commit")
    ax.set_ylabel(ylabel)
    ax.tick_params(axis="x", rotation=30)
plt.tight_layout()
plt.show()
Commit Intent and Evolution Style¶
We derive lightweight intent buckets from commit messages and conventional tags to compare evolution styles and BC policies.
def infer_intent(row):
    """Assign a coarse intent label to a commit.

    Builds a lowercase haystack from the conventional-commit tag plus the
    short message and returns the label of the first rule (in declaration
    order) with any keyword substring hit, falling back to "other".
    """
    tag = str(row.get("conventional_commit_tag", "") or "").strip().lower()
    msg = str(row.get("commit_short_msg", "") or "").strip().lower()
    text = f"{tag} {msg}"
    # Ordered rules: earlier labels win when several keyword sets match.
    # Matching is plain substring search (deliberately lightweight), so short
    # keywords such as "ci" or "doc" can also hit inside longer words.
    rules = [
        ("release", ["release", "version", "bump version", "prepare", "tag"]),
        ("revert", ["revert"]),
        ("deps", ["dependenc", "upgrade", "bump", "renovate"]),
        ("build", ["pom", "gradle", "maven", "build", "ci", "workflow"]),
        ("docs", ["readme", "doc", "javadoc", "documentation"]),
        ("test", ["test", "assert", "spec"]),
        ("fix", ["fix", "bug", "issue", "patch", "hotfix"]),
        ("refactor", ["refactor", "cleanup", "simplify", "rename"]),
        ("feature", ["feat", "feature", "add", "introduce", "implement"]),
        ("breaking", ["breaking", "remove", "drop", "deprecat", "api change"]),
    ]
    return next(
        (label for label, keywords in rules if any(kw in text for kw in keywords)),
        "other",
    )
# Attach message metadata so intent can be inferred row-by-row.
intent_df = commit_enriched.merge(
    commits_df[["library", "commit_sha", "commit_short_msg", "conventional_commit_tag"]],
    left_on=["library", "commit"],
    right_on=["library", "commit_sha"],
    how="left",
)
intent_df["intent"] = intent_df.apply(infer_intent, axis=1)

intent_stats = (
    intent_df.groupby(["library", "intent"])
    .agg(
        commits=("commit", "count"),
        break_commit_rate=("has_bc_rows", "mean"),
        mean_bc_rows=("bc_rows", "mean"),
    )
    .reset_index()
)
# Share of each intent within its own library.
intent_stats["intent_share"] = intent_stats.groupby("library")["commits"].transform(lambda s: s / s.sum())
intent_stats.head(20)
| library | intent | commits | break_commit_rate | mean_bc_rows | intent_share | |
|---|---|---|---|---|---|---|
| 0 | JSON-java | breaking | 7 | 0.142857 | 0.142857 | 0.017995 |
| 1 | JSON-java | build | 21 | 0.047619 | 7.142857 | 0.053985 |
| 2 | JSON-java | deps | 2 | 0.000000 | 0.000000 | 0.005141 |
| 3 | JSON-java | docs | 42 | 0.000000 | 0.000000 | 0.107969 |
| 4 | JSON-java | feature | 7 | 0.142857 | 0.428571 | 0.017995 |
| 5 | JSON-java | fix | 66 | 0.030303 | 0.060606 | 0.169666 |
| 6 | JSON-java | other | 181 | 0.000000 | 0.000000 | 0.465296 |
| 7 | JSON-java | refactor | 6 | 0.166667 | 0.500000 | 0.015424 |
| 8 | JSON-java | release | 27 | 0.000000 | 0.000000 | 0.069409 |
| 9 | JSON-java | revert | 6 | 0.166667 | 2.500000 | 0.015424 |
| 10 | JSON-java | test | 24 | 0.000000 | 0.000000 | 0.061697 |
| 11 | assertj-core | breaking | 106 | 0.415094 | 67.849057 | 0.028867 |
| 12 | assertj-core | build | 169 | 0.248521 | 86.201183 | 0.046024 |
| 13 | assertj-core | deps | 753 | 0.007968 | 17.540505 | 0.205065 |
| 14 | assertj-core | docs | 231 | 0.290043 | 77.510823 | 0.062908 |
| 15 | assertj-core | feature | 122 | 0.319672 | 62.647541 | 0.033224 |
| 16 | assertj-core | fix | 266 | 0.421053 | 56.755639 | 0.072440 |
| 17 | assertj-core | other | 426 | 0.415493 | 163.873239 | 0.116013 |
| 18 | assertj-core | refactor | 67 | 0.597015 | 55.313433 | 0.018246 |
| 19 | assertj-core | release | 643 | 0.024883 | 11.555210 | 0.175109 |
# Evolution style map: intent composition (share) + policy map (break rate by intent)
# Restrict to the canonical intent buckets so both heatmaps share columns.
selected_intents = ["feature", "fix", "refactor", "deps", "build", "release", "breaking", "docs", "test", "other"]
plot_intent = intent_stats[intent_stats["intent"].isin(selected_intents)].copy()
# library x intent matrices; (library, intent) pairs with no commits become 0.
heat_share = plot_intent.pivot(index="library", columns="intent", values="intent_share").fillna(0)
heat_break_rate = plot_intent.pivot(index="library", columns="intent", values="break_commit_rate").fillna(0)
# Order rows by total commit volume so the busiest libraries sit on top.
lib_order = coverage.sort_values("commits", ascending=False).index
heat_share = heat_share.reindex(lib_order)
heat_break_rate = heat_break_rate.reindex(lib_order)
fig, axes = plt.subplots(1, 2, figsize=(22, 10))
# Left panel: what each library spends its commits on.
sns.heatmap(heat_share, cmap="Blues", ax=axes[0], cbar_kws={"label": "Share of commits"})
axes[0].set_title("Intent composition by library")
axes[0].set_xlabel("Intent")
axes[0].set_ylabel("Library")
# Right panel: how likely each intent is to break, per library.
sns.heatmap(heat_break_rate, cmap="Reds", ax=axes[1], cbar_kws={"label": "P(commit has BC rows)"})
axes[1].set_title("Break propensity by intent and library")
axes[1].set_xlabel("Intent")
axes[1].set_ylabel("")
plt.tight_layout()
plt.show()
Library Profiles and Comparative Mapping¶
We aggregate multi-dimensional features to characterize library evolution profiles and compare policy/style differences.
# Build profile features leveraging both commits and BC rows
# BC-row counts per (library, nature); one column per observed nature value.
nature_stats = bcs_df.groupby(["library", "nature"]).size().unstack(fill_value=0)
kind_diversity = bcs_df.groupby("library")["kind"].nunique().rename("bc_kind_diversity")
# Share of a library's BC rows that touch public-visibility symbols.
visibility_public_share = (
    bcs_df.assign(is_public=bcs_df["symbol_visibility"].eq("public"))
    .groupby("library")["is_public"].mean()
    .rename("public_visibility_share")
)
# Volatility of the monthly BC volume (std across calendar months; 0 for a
# single-month library where std is NaN).
monthly_break_vol = (
    commits_df.assign(month=commits_df["date_utc"].dt.tz_convert(None).dt.to_period("M").astype(str))
    .groupby(["library", "month"])["breaking_changes_count"].sum()
    .groupby("library").std()
    .fillna(0)
    .rename("monthly_break_std")
)
# Commit-level aggregates per library. The median_churn_break_commits lambda
# restricts each library's churn series to its break commits by looking up
# has_bc_rows through the shared commit_enriched index; it yields 0 when the
# library has no break commits at all.
profile = commit_enriched.groupby("library").agg(
    commits=("commit", "count"),
    share_break_commits=("has_bc_rows", "mean"),
    mean_bc_rows_per_commit=("bc_rows", "mean"),
    median_churn=("abs_loc_churn", "median"),
    median_churn_break_commits=("abs_loc_churn", lambda s: s[commit_enriched.loc[s.index, "has_bc_rows"]].median() if commit_enriched.loc[s.index, "has_bc_rows"].any() else 0),
    merge_rate=("is_merge_commit", "mean"),
    tagged_commit_rate=("is_tagged_commit", "mean"),
    pom_change_rate=("has_pom_changes", "mean"),
    java_change_rate=("has_java_changes", "mean"),
    median_days_since_prev=("days_since_prev_commit", "median"),
    mean_internal_share=("internal_share", "mean"),
    mean_deprecated_share=("deprecated_share", "mean"),
    mean_binary_share_per_breaking_commit=("binary_share_in_commit", "mean"),
).join(flags.set_index("library")[["excluded_bcs_share", "internal_removals_share", "deprecated_removals_share"]], how="left")
profile = profile.join(kind_diversity, how="left")
profile = profile.join(visibility_public_share, how="left")
profile = profile.join(monthly_break_vol, how="left")
# Guarantee all three nature columns exist before computing shares.
for col in ["deletion", "mutation", "addition"]:
    if col not in nature_stats.columns:
        nature_stats[col] = 0
# Shares of each nature within a library; replace(0, nan) guards the division
# for libraries with zero BC rows, then fillna(0) maps them back to 0.
nature_shares = nature_stats[["deletion", "mutation", "addition"]].div(nature_stats.sum(axis=1).replace(0, np.nan), axis=0).fillna(0)
nature_shares.columns = ["nature_deletion_share", "nature_mutation_share", "nature_addition_share"]
profile = profile.join(nature_shares, how="left")
profile["bc_per_100_commits"] = profile["mean_bc_rows_per_commit"] * 100
# How much larger churn is on break commits vs all commits; guard div-by-zero
# and normalize any inf/NaN results to 0.
profile["break_churn_multiplier"] = profile["median_churn_break_commits"] / profile["median_churn"].replace(0, np.nan)
profile["break_churn_multiplier"] = profile["break_churn_multiplier"].replace([np.inf, -np.inf], np.nan).fillna(0)
profile.head(20)
| commits | share_break_commits | mean_bc_rows_per_commit | median_churn | median_churn_break_commits | merge_rate | tagged_commit_rate | pom_change_rate | java_change_rate | median_days_since_prev | mean_internal_share | mean_deprecated_share | mean_binary_share_per_breaking_commit | excluded_bcs_share | internal_removals_share | deprecated_removals_share | bc_kind_diversity | public_visibility_share | monthly_break_std | nature_deletion_share | nature_mutation_share | nature_addition_share | bc_per_100_commits | break_churn_multiplier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| library | ||||||||||||||||||||||||
| JSON-java | 389 | 0.017995 | 0.452442 | 24.0 | 692.0 | 0.658098 | 0.064267 | 0.084833 | 0.735219 | 4.0 | 0.000000 | 0.000275 | 0.015544 | 0.000000 | 0.000000 | 0.000000 | 7 | 0.982955 | 13.581015 | 0.159091 | 0.840909 | 0.000000 | 45.244216 | 28.833333 |
| assertj-core | 3672 | 0.272603 | 62.846133 | 10.0 | 261.0 | 0.049564 | 0.016885 | 0.395969 | 0.512527 | 0.0 | 0.179138 | 0.007116 | 0.255184 | 0.042029 | 0.030303 | 0.006054 | 24 | 0.875786 | 2444.442170 | 0.592410 | 0.389741 | 0.017849 | 6284.613290 | 26.100000 |
| awaitility | 630 | 0.063492 | 0.415873 | 6.0 | 199.5 | 0.057143 | 0.060317 | 0.325397 | 0.298413 | 0.0 | 0.000000 | 0.000334 | 0.050452 | 0.000000 | 0.000000 | 0.003817 | 14 | 1.000000 | 9.565623 | 0.648855 | 0.297710 | 0.053435 | 41.587302 | 33.250000 |
| commons-beanutils | 2196 | 0.060565 | 1.040528 | 8.0 | 75.0 | 0.013661 | 0.000455 | 0.141166 | 0.518215 | 0.0 | 0.000000 | 0.010436 | 0.053713 | 0.000000 | 0.000000 | 0.041575 | 21 | 0.820569 | 53.498194 | 0.466083 | 0.525602 | 0.008315 | 104.052823 | 9.375000 |
| commons-cli | 1838 | 0.047334 | 0.577802 | 7.0 | 120.0 | 0.026115 | 0.001632 | 0.152884 | 0.431447 | 0.0 | 0.000000 | 0.049088 | 0.034920 | 0.000000 | 0.000000 | 0.000000 | 21 | 0.953861 | 36.877658 | 0.715631 | 0.245763 | 0.038606 | 57.780196 | 17.142857 |
| commons-codec | 3172 | 0.046028 | 0.293821 | 8.0 | 69.5 | 0.018600 | 0.001892 | 0.139029 | 0.559269 | 0.0 | 0.000000 | 0.058108 | 0.038974 | 0.000000 | 0.000000 | 0.008584 | 16 | 0.907725 | 20.824764 | 0.680258 | 0.313305 | 0.006438 | 29.382093 | 8.687500 |
| commons-collections | 4856 | 0.083608 | 2.051689 | 14.0 | 108.0 | 0.022035 | 0.001030 | 0.091639 | 0.678336 | 0.0 | 0.000000 | 0.003226 | 0.070279 | 0.000000 | 0.000000 | 0.001807 | 25 | 0.821841 | 316.188783 | 0.430192 | 0.552645 | 0.017164 | 205.168863 | 7.714286 |
| commons-compress | 5436 | 0.039183 | 0.252024 | 10.0 | 79.0 | 0.017476 | 0.001104 | 0.093635 | 0.733996 | 0.0 | 0.001397 | 0.014987 | 0.029115 | 0.000730 | 0.000730 | 0.000000 | 17 | 0.805109 | 21.136561 | 0.524818 | 0.452555 | 0.022628 | 25.202355 | 7.900000 |
| commons-io | 5560 | 0.033633 | 0.217446 | 9.0 | 145.0 | 0.016187 | 0.001799 | 0.107734 | 0.680576 | 0.0 | 0.000000 | 0.031862 | 0.028655 | 0.000000 | 0.000000 | 0.001654 | 20 | 0.892473 | 24.876858 | 0.493797 | 0.434243 | 0.071960 | 21.744604 | 16.111111 |
| commons-lang | 8614 | 0.045972 | 0.263989 | 10.0 | 105.5 | 0.024379 | 0.001509 | 0.086951 | 0.717088 | 0.0 | 0.000000 | 0.018381 | 0.040930 | 0.000000 | 0.000000 | 0.000440 | 22 | 0.933157 | 24.821353 | 0.619613 | 0.359719 | 0.020668 | 26.398886 | 10.550000 |
| commons-logging | 1826 | 0.033406 | 0.197700 | 8.0 | 92.0 | 0.036692 | 0.002738 | 0.171961 | 0.365279 | 0.0 | 0.000000 | 0.018995 | 0.027149 | 0.000000 | 0.000000 | 0.000000 | 13 | 0.850416 | 11.760604 | 0.764543 | 0.196676 | 0.038781 | 19.769989 | 11.500000 |
| commons-pool | 2804 | 0.085592 | 1.194722 | 8.0 | 81.5 | 0.016762 | 0.007133 | 0.166904 | 0.529601 | 0.0 | 0.000000 | 0.020874 | 0.073684 | 0.000000 | 0.000000 | 0.045373 | 17 | 0.862388 | 80.660140 | 0.640000 | 0.322985 | 0.037015 | 119.472183 | 10.187500 |
| commons-text | 2322 | 0.020241 | 0.123600 | 5.0 | 336.0 | 0.071921 | 0.004307 | 0.261413 | 0.444875 | 0.0 | 0.000428 | 0.012485 | 0.017980 | 0.010453 | 0.010453 | 0.048780 | 13 | 0.954704 | 10.949225 | 0.801394 | 0.135889 | 0.062718 | 12.360034 | 67.200000 |
| fastjson-core | 2934 | 0.109748 | 0.623381 | 36.0 | 165.5 | 0.140082 | 0.029993 | 0.103954 | 0.914110 | 0.0 | 0.000000 | 0.018740 | 0.101410 | 0.000000 | 0.000000 | 0.004921 | 23 | 0.953527 | 67.188583 | 0.827775 | 0.141607 | 0.030618 | 62.338105 | 4.597222 |
| fastjson2-core | 4597 | 0.079835 | 0.435066 | 41.0 | 288.0 | 0.048727 | 0.013922 | 0.219056 | 0.774636 | 0.0 | 0.046233 | 0.002812 | 0.069596 | 0.075500 | 0.068500 | 0.001500 | 20 | 0.862000 | 71.829450 | 0.620000 | 0.348500 | 0.031500 | 43.506635 | 7.024390 |
| gson | 1977 | 0.054628 | 0.237734 | 20.0 | 120.0 | 0.070309 | 0.013151 | 0.267577 | 0.669196 | 0.0 | 0.109508 | 0.016023 | 0.050835 | 0.751064 | 0.257447 | 0.000000 | 19 | 0.985106 | 14.360114 | 0.651064 | 0.338298 | 0.010638 | 23.773394 | 6.000000 |
| guava | 6972 | 0.054217 | 0.292169 | 31.0 | 122.5 | 0.000143 | 0.000717 | 0.055221 | 0.897734 | 0.0 | 0.032814 | 0.028148 | 0.040945 | 0.730486 | 0.092784 | 0.136475 | 20 | 0.974472 | 30.668046 | 0.548846 | 0.349043 | 0.102111 | 29.216867 | 3.951613 |
| h2-database | 8024 | 0.172483 | 4.238161 | 37.0 | 271.5 | 0.192547 | 0.002493 | 0.008724 | 0.871386 | 0.0 | 0.000000 | 0.000336 | 0.152342 | 0.000000 | 0.000000 | 0.000706 | 25 | 0.899285 | 471.502985 | 0.593995 | 0.386685 | 0.019320 | 423.816052 | 7.337838 |
| hamcrest-core | 621 | 0.077295 | 0.700483 | 20.0 | 80.5 | 0.069243 | 0.025765 | 0.000000 | 0.611916 | 0.0 | 0.053894 | 0.005599 | 0.057801 | 0.018391 | 0.018391 | 0.002299 | 13 | 0.937931 | 14.638984 | 0.434483 | 0.514943 | 0.050575 | 70.048309 | 4.025000 |
| hibernate-core | 4826 | 0.175300 | 7.095731 | 53.0 | 328.0 | 0.016162 | 0.009946 | 0.013054 | 0.846042 | 0.0 | 0.240281 | 0.007952 | 0.127562 | 0.156787 | 0.079634 | 0.021318 | 27 | 0.898026 | 1351.930108 | 0.666365 | 0.198341 | 0.135294 | 709.573145 | 6.188679 |
# Profile map via SVD on standardized features (no external ML deps)
feature_cols = [
    "share_break_commits", "bc_per_100_commits", "merge_rate", "tagged_commit_rate", "pom_change_rate",
    "mean_internal_share", "mean_deprecated_share", "excluded_bcs_share", "internal_removals_share",
    "deprecated_removals_share", "bc_kind_diversity", "public_visibility_share", "monthly_break_std",
    "nature_deletion_share", "nature_mutation_share", "nature_addition_share", "break_churn_multiplier",
]
# Standardize each feature (z-score); zero-variance columns divide by 1
# instead of 0 so they contribute nothing rather than NaNs.
X = profile[feature_cols].copy().fillna(0)
Xz = (X - X.mean(axis=0)) / X.std(axis=0).replace(0, 1)
# Left singular vectors scaled by singular values give the 2-D projection
# (equivalent to PCA scores on the standardized matrix, up to sign).
U, S, Vt = np.linalg.svd(Xz.values, full_matrices=False)
coords = pd.DataFrame(U[:, :2] * S[:2], index=Xz.index, columns=["profile_x", "profile_y"])
profile_map = profile.join(coords)
plt.figure(figsize=(14, 10))
sns.scatterplot(
    data=profile_map.reset_index(),
    x="profile_x",
    y="profile_y",
    size="bc_per_100_commits",
    hue="share_break_commits",
    palette="viridis",
    sizes=(50, 500),
    alpha=0.85,
)
# Annotate every point with its library name.
for row in profile_map.reset_index().itertuples(index=False):
    plt.text(row.profile_x, row.profile_y, row.library, fontsize=9, alpha=0.85)
plt.title("Library profile map (dimension-reduced style/policy features)")
plt.xlabel("Profile axis 1")
plt.ylabel("Profile axis 2")
plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()
# Z-score comparison heatmap across all libraries
# Rows ordered by commit volume, matching the other library-level views.
row_order = coverage.sort_values("commits", ascending=False).index
profile_z = Xz.loc[row_order]
# Scale figure height with library count so row labels stay readable.
plt.figure(figsize=(20, max(10, 0.45 * len(profile_z))))
sns.heatmap(profile_z, cmap="coolwarm", center=0, cbar_kws={"label": "Feature z-score"})
plt.title("Comparative profile heatmap (all libraries)")
plt.xlabel("Profile features")
plt.ylabel("Library")
plt.tight_layout()
plt.show()
Where Breaking Changes Concentrate¶
Concentration metrics show whether BCs are spread across many packages/types or focused in a few hotspots.
def hhi_from_counts(counts: pd.Series) -> float:
    """Herfindahl-Hirschman index of a count distribution.

    Returns the sum of squared shares: 1.0 means fully concentrated in one
    bucket, and n equal buckets give 1/n. NaN when the total is not positive.
    """
    grand_total = counts.sum()
    if grand_total <= 0:
        return np.nan
    squared_shares = (counts / grand_total).pow(2)
    return float(squared_shares.sum())
# Per-library concentration of BC rows across impacted packages and types.
pkg_conc = bcs_df.groupby(["library", "impacted_package_fqn"]).size().rename("count").reset_index()
pkg_hhi = pkg_conc.groupby("library")["count"].apply(hhi_from_counts).rename("package_hhi")
pkg_unique = pkg_conc.groupby("library")["impacted_package_fqn"].nunique().rename("unique_impacted_packages")
# Same two metrics at type granularity.
type_conc = bcs_df.groupby(["library", "impacted_type_fqn"]).size().rename("count").reset_index()
type_hhi = type_conc.groupby("library")["count"].apply(hhi_from_counts).rename("type_hhi")
type_unique = type_conc.groupby("library")["impacted_type_fqn"].nunique().rename("unique_impacted_types")
# One row per library: BC volume plus the four concentration metrics.
conc = (
    coverage[["bc_rows"]]
    .join(pkg_hhi, how="left")
    .join(pkg_unique, how="left")
    .join(type_hhi, how="left")
    .join(type_unique, how="left")
    .reset_index()
)
# Two scatter panels sharing the x-axis (BC volume, log scale): one for
# package-level concentration, one for type-level concentration.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
scatter_specs = [
    ("package_hhi", "unique_impacted_packages", "mako",
     "Package concentration vs BC volume",
     "HHI of impacted package distribution"),
    ("type_hhi", "unique_impacted_types", "crest",
     "Type concentration vs BC volume",
     "HHI of impacted type distribution"),
]
for ax, (y_col, n_col, palette, title, ylabel) in zip(axes, scatter_specs):
    sns.scatterplot(
        data=conc,
        x="bc_rows",
        y=y_col,
        size=n_col,
        hue=n_col,
        palette=palette,
        sizes=(30, 350),
        ax=ax,
    )
    ax.set_xscale("log")
    ax.set_title(title)
    ax.set_xlabel("BC rows (log scale)")
    ax.set_ylabel(ylabel)
plt.tight_layout()
plt.show()
conc.sort_values("package_hhi", ascending=False).head(15)
| library | bc_rows | package_hhi | unique_impacted_packages | type_hhi | unique_impacted_types | |
|---|---|---|---|---|---|---|
| 33 | JSON-java | 176 | 1.000000 | 1 | 0.118221 | 15 |
| 23 | protobuf-java | 562 | 1.000000 | 1 | 0.055426 | 54 |
| 26 | hamcrest-core | 435 | 0.691552 | 4 | 0.053687 | 51 |
| 0 | assertj-core | 230771 | 0.614291 | 354 | 0.004927 | 10189 |
| 24 | gson | 470 | 0.446781 | 8 | 0.068139 | 103 |
| 27 | jakartaee-validation | 416 | 0.316753 | 11 | 0.018618 | 161 |
| 31 | slf4j-api | 249 | 0.314559 | 5 | 0.053370 | 49 |
| 8 | joda-time | 3676 | 0.308485 | 12 | 0.012665 | 233 |
| 28 | commons-logging | 361 | 0.305975 | 18 | 0.031177 | 110 |
| 10 | commons-pool | 3350 | 0.278548 | 12 | 0.024256 | 228 |
| 17 | log4j-api | 1546 | 0.273850 | 9 | 0.110659 | 131 |
| 9 | netty-codec-http | 3392 | 0.266669 | 7 | 0.012840 | 226 |
| 22 | jsoup | 919 | 0.238523 | 11 | 0.039584 | 146 |
| 15 | fastjson2-core | 2000 | 0.232882 | 26 | 0.023001 | 259 |
| 34 | jakartaee-servlet-api | 131 | 0.231047 | 7 | 0.018938 | 97 |
# Public/protected + excluded/internal interaction matrix by kind
interaction = bcs_df.copy()
# Precedence matters in np.select: the first matching condition wins, so
# internal+excluded outranks internal alone, which outranks excluded, etc.
interaction["group"] = np.select(
    [
        interaction["is_internal_removal"] & interaction["is_excluded_symbol"],
        interaction["is_internal_removal"],
        interaction["is_excluded_symbol"],
        interaction["is_deprecated_removal"],
    ],
    ["internal+excluded", "internal", "excluded", "deprecated-removal"],
    default="regular",
)
# NOTE(review): value_counts().index contains *every* kind, so the isin()
# filter below is currently a no-op; use .head(N) here if only the most
# frequent kinds are intended.
top_kinds = interaction["kind"].value_counts().index
mat = (
    interaction[interaction["kind"].isin(top_kinds)]
    .groupby(["kind", "group"]).size().rename("count").reset_index()
)
# Normalize counts to shares within each kind so rows sum to 1.
mat["share"] = mat.groupby("kind")["count"].transform(lambda s: s / s.sum())
heat = mat.pivot(index="kind", columns="group", values="share").fillna(0)
plt.figure(figsize=(12, 12))
sns.heatmap(heat, cmap="OrRd", annot=True, fmt=".2f", cbar_kws={"label": "Share within kind"})
plt.xticks(rotation=90)
plt.title("How BC kinds split across regular/excluded/internal/deprecated groups")
plt.xlabel("BC group")
plt.ylabel("BC kind")
plt.tight_layout()
plt.show()
Optional: Per-Library Deep Dive Helper¶
Set LIB_TO_INSPECT to quickly visualize the full timeline and BC composition of a specific library.
# Single-library deep dive: timeline, rolling BC rate, kinds, compatibility.
LIB_TO_INSPECT = "guava"  # change as needed
# Commit timeline and per-BC rows restricted to the chosen library.
dc = commits_df[commits_df["library"] == LIB_TO_INSPECT].sort_values("date_utc").copy()
db = bcs_enriched[bcs_enriched["library"] == LIB_TO_INSPECT].copy()
if dc.empty:
    print(f"Library {LIB_TO_INSPECT} not found.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(18, 11))
    # Top-left: exported vs internal symbol counts over time.
    # NOTE(review): assumes an "exported_symbols_count" column is derived
    # earlier in the notebook (it is not in NUMERIC_COLS_COMMITS) — confirm.
    axes[0, 0].plot(dc["date_utc"], dc["exported_symbols_count"], color="#1d3557", label="Exported")
    axes[0, 0].plot(dc["date_utc"], dc["internal_count"], color="#2a9d8f", label="Internal")
    axes[0, 0].set_title(f"{LIB_TO_INSPECT}: API size timeline")
    axes[0, 0].legend()
    # Top-right: 60-commit rolling mean of BC counts (needs >= 5 commits).
    axes[0, 1].plot(dc["date_utc"], dc["breaking_changes_count"].rolling(60, min_periods=5).mean(), color="#e76f51")
    axes[0, 1].set_title(f"{LIB_TO_INSPECT}: rolling BC count (60 commits)")
    # Bottom-left: the ten most frequent BC kinds.
    kind_counts = db["kind"].value_counts().head(10)
    sns.barplot(x=kind_counts.values, y=kind_counts.index, ax=axes[1, 0], color="#457b9d")
    axes[1, 0].set_title("Top BC kinds")
    axes[1, 0].set_xlabel("Count")
    axes[1, 0].set_ylabel("")
    # Bottom-right: compatibility split, keeping missing values visible.
    comp_counts = db["compatibility"].value_counts(dropna=False)
    axes[1, 1].pie(comp_counts.values, labels=comp_counts.index, autopct="%.1f%%")
    axes[1, 1].set_title("Compatibility split")
    plt.tight_layout()
    plt.show()